{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "private_outputs": true,
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/mikexcohen/Statistics_book/blob/main/stats_ch05_simulating_data.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Modern statistics: Intuition, Math, Python, R\n",
    "## Mike X Cohen (sincxpress.com)\n",
    "#### https://www.amazon.com/dp/B0CQRGWGLY\n",
    "#### Code for chapter 5 (simulating data)\n",
    "\n",
    "---\n",
    "\n",
    "# About this code file:\n",
    "\n",
    "### This notebook will reproduce most of the figures in this chapter (some figures were made in Inkscape), and illustrate the statistical concepts explained in the text. The point of providing the code is not just for you to recreate the figures, but for you to modify, adapt, explore, and experiment with the code.\n",
    "\n",
    "### Solutions to all exercises are at the bottom of the notebook.\n",
    "\n",
    "#### This code was written in google-colab. The notebook may require some modifications if you use a different IDE."
   ],
   "metadata": {
    "id": "yeVh6hm2ezCO"
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "dFT1qwVEyxTW"
   },
   "outputs": [],
   "source": [
    "# import libraries and define global settings\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# define global figure properties used for publication\n",
    "import matplotlib_inline.backend_inline\n",
    "matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # display figures in vector format\n",
    "plt.rcParams.update({'font.size':14,             # font size\n",
    "                     'savefig.dpi':300,          # output resolution\n",
    "                     'axes.titlelocation':'left',# title location\n",
    "                     'axes.spines.right':False,  # remove axis bounding box\n",
    "                     'axes.spines.top':False,    # remove axis bounding box\n",
    "                     })"
   ]
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "CfjW5mgXn_CI"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 5.2: Normally distributed random data"
   ],
   "metadata": {
    "id": "Wm_MLTTdy6K0"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters for the distributions\n",
    "means = [ -1,0,1,3 ]\n",
    "stds  = [1,.1,3,1.6 ]\n",
    "samplesize = 2500\n",
    "\n",
    "_,axs = plt.subplots(2,2,figsize=(10,8))\n",
    "\n",
    "for idx,axis in enumerate( axs.flatten() ):\n",
    "\n",
    "  # generate some data\n",
    "  X = np.random.normal(loc=means[idx],scale=stds[idx],size=samplesize)\n",
    "\n",
    "  # compute empirical mean and std\n",
    "  empave = np.mean(X)\n",
    "  empstd = np.std(X,ddof=1)\n",
    "\n",
    "  # draw the histogram using the F-D rule for bin width\n",
    "  axis.hist(X,bins='fd',color='gray',edgecolor='gray')\n",
    "  axis.set(xlim=[-15,15],ylabel='Count')\n",
    "  axis.set_title(f'$\\\\mu$={means[idx]}, $\\\\overline{{X}}$={empave:.3f}\\n$\\\\sigma$={stds[idx]}, std(X)={empstd:.3f}',loc='center')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_normal4examples.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "QLmGlnPP-vRF"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "2ImGERoS-vTa"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 5.4: Uniformly distributed random data"
   ],
   "metadata": {
    "id": "VoqSOpMBy5_d"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "a,b = np.sort( np.random.randint(-3,11,2) )\n",
    "N = 1001\n",
    "\n",
    "Y = np.random.uniform(a,b,size=N)\n",
    "\n",
    "print(a,b,np.min(a),np.max(b))\n",
    "print('')\n",
    "print(np.mean(Y),(a+b)/2,np.median(Y))\n",
    "print(np.var(Y,ddof=1),(b-a)**2/12)"
   ],
   "metadata": {
    "id": "CPdaOPT418Z7"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters for the distributions\n",
    "aa = [ -1,  0, 2,  1  ]\n",
    "bb = [  1, .1, 3, 1.6 ]\n",
    "samplesize = 2500\n",
    "\n",
    "_,axs = plt.subplots(2,2,figsize=(10,8))\n",
    "\n",
    "for idx,axis in enumerate( axs.flatten() ):\n",
    "\n",
    "  # generate some data\n",
    "  X = np.random.uniform(aa[idx],bb[idx],size=samplesize)\n",
    "\n",
    "  # compute empirical boundaries\n",
    "  bndL = np.min(X)\n",
    "  bndU = np.max(X)\n",
    "\n",
    "  # draw the histogram using the F-D rule for bin width\n",
    "  axis.hist(X,bins='fd',color='gray',edgecolor='gray')\n",
    "  axis.set(xlim=[-1.5,3.5],ylabel='Count')\n",
    "  axis.set_title(f'a={aa[idx]}, min(Y)={bndL:.3f}\\n b={bb[idx]}, max(Y)={bndU:.3f}',loc='center')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_uniform4examples.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "QHwgoJKv18f7"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "M8-4ubi0uBYW"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 5.5: Example Weibull distribution"
   ],
   "metadata": {
    "id": "Is2QE3p6uBVq"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# some data\n",
    "X = np.random.weibull(2,5000)\n",
    "\n",
    "# create a histogram\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.hist(X,'fd',color=(.8,.8,.8),edgecolor='k')\n",
    "plt.xlabel('Data values')\n",
    "plt.ylabel('Counts')\n",
    "plt.title('Example Weibull distribution',loc='center')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_weibull.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "mIAWcaOLuBS3"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "O_2kxaqhuBPx"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 5.6: Example log-normal distribution"
   ],
   "metadata": {
    "id": "MjQcbAI_x1pV"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# generate the data using a normal distribution passed through an exponential\n",
    "Y = np.random.normal(0,1,size=5000)\n",
    "X = np.exp( Y*.5 + 1 )\n",
    "\n",
    "# you can also use the built-in numpy function\n",
    "#X = np.random.lognormal(1,1/2,5000)\n",
    "\n",
    "# create a histogram\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.hist(X,bins=80,color=(.8,.8,.8),edgecolor='k')\n",
    "plt.xlabel('Data values')\n",
    "plt.ylabel('Counts')\n",
    "plt.title('Example log-normal distribution',loc='center')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_lognormal.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "WHJluH0Rx1pa"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "14tEUUUOZe_s"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Random integers"
   ],
   "metadata": {
    "id": "Eh5cPRao18jE"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "z = np.random.randint(-5,6,size=10000)\n",
    "\n",
    "plt.hist(z,bins=len(set(z)));"
   ],
   "metadata": {
    "id": "EhKbAcCbGl5o"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "z = np.round(np.random.randn(10000)*10)\n",
    "print(np.unique(z))\n",
    "plt.hist(z);"
   ],
   "metadata": {
    "id": "iTgNl2f318l-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "PyqagkZ-18ov"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Random selection"
   ],
   "metadata": {
    "id": "1rtVvHvb18uU"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "s = [1,2,np.pi,10]\n",
    "np.random.choice(s,1)"
   ],
   "metadata": {
    "id": "NP4mULrN18xY"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# not limited to numbers\n",
    "t = ['a','b','hello']\n",
    "np.random.choice(t,1)"
   ],
   "metadata": {
    "id": "9Es8nD1khFc-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "np.random.choice(s,4)"
   ],
   "metadata": {
    "id": "EpaQOwPX180E"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "np.random.choice(s,4,replace=False)"
   ],
   "metadata": {
    "id": "a9bolwi2183E"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "WxoU2kqx186E"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Random permutations"
   ],
   "metadata": {
    "id": "5n62_Eon19AE"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "l = np.arange(5)\n",
    "print(l)\n",
    "print(np.random.permutation(l))"
   ],
   "metadata": {
    "id": "nJ2zToE0zb8c"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# to randomly re-sort a dataset\n",
    "\n",
    "theData = np.arange(-3,4)**3\n",
    "theData"
   ],
   "metadata": {
    "id": "FjX0TXnF19DI"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "newIdx = np.random.permutation(len(theData))\n",
    "shufData = theData[newIdx]\n",
    "\n",
    "print(theData)\n",
    "print(newIdx)\n",
    "print(shufData)"
   ],
   "metadata": {
    "id": "Hf_cw47F19F4"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "H6MVNc0PSB_-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Seeding the rng"
   ],
   "metadata": {
    "id": "QsgO9Sz1SB3p"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "np.random.randn(3,3)"
   ],
   "metadata": {
    "id": "nKOyKRUky5lC"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "rs = np.random.RandomState(17)\n",
    "rs.randn(3,3)"
   ],
   "metadata": {
    "id": "sI7S8qVSTnpr"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "DY6TN2v-Tnmk"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 5.8: Running an experiment"
   ],
   "metadata": {
    "id": "eSleqfsWTnjb"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# the key factor to manipulate\n",
    "stds = np.linspace(.01,10,40)\n",
    "\n",
    "# parameters to hold constant\n",
    "samplesize = 100\n",
    "mean = 0\n",
    "\n",
    "\n",
    "# initialize results matrix\n",
    "results = np.zeros(len(stds))\n",
    "\n",
    "\n",
    "# start the experiment\n",
    "for stdi in range(len(stds)):\n",
    "\n",
    "  # data parameters for this experiment run\n",
    "  thisStd = stds[stdi]\n",
    "\n",
    "  # generate data\n",
    "  data = np.random.normal(mean,thisStd,samplesize)\n",
    "\n",
    "  # collect results\n",
    "  results[stdi] = np.mean(data)\n",
    "\n",
    "\n",
    "# plot the results\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(stds,results,'ks',markersize=10,markerfacecolor=(.9,.9,.9))\n",
    "\n",
    "# add to the plot\n",
    "plt.axhline(mean,linestyle='--',color=(.3,.3,.3),zorder=-1)\n",
    "plt.xlabel('Population standard deviation')\n",
    "plt.ylabel('Empirical mean')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_experiment1.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "QcD5asCXiMrt"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "tYQEl5HNp0U-"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Figure 5.9: A follow-up experiment"
   ],
   "metadata": {
    "id": "VSzrx1jXp0qo"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "## the next experiment with two factors to manipulate\n",
    "\n",
    "# the key factors to manipulate\n",
    "stds = np.linspace(.01,10,40)\n",
    "samplesizes = [100,10000]\n",
    "\n",
    "# parameters to hold constant\n",
    "meanvalue = 0\n",
    "\n",
    "\n",
    "# initialize results matrix\n",
    "results = np.zeros((len(stds),len(samplesizes)))\n",
    "\n",
    "# start the experiment\n",
    "for stdi in range(len(stds)):\n",
    "\n",
    "  for sampi in range(len(samplesizes)):\n",
    "\n",
    "    # data parameters for this experiment run\n",
    "    thisStd = stds[stdi]\n",
    "    thisN = samplesizes[sampi]\n",
    "\n",
    "    # generate data\n",
    "    data = np.random.normal(meanvalue,thisStd,samplesizes[sampi])\n",
    "\n",
    "    # collect results\n",
    "    results[stdi,sampi] = np.mean(data)\n",
    "\n",
    "\n",
    "# plot the results\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(stds,results[:,0],'ks',markersize=10,markerfacecolor=(.9,.9,.9),label=f'N = {samplesizes[0]}')\n",
    "plt.plot(stds,results[:,1],'ko',markersize=10,markerfacecolor=(.5,.5,.5),label=f'N = {samplesizes[1]}')\n",
    "\n",
    "plt.axhline(meanvalue,linestyle='--',color=(.3,.3,.3),zorder=-1)\n",
    "\n",
    "plt.xlabel('Population standard deviation')\n",
    "plt.ylabel('Empirical mean')\n",
    "\n",
    "plt.legend(loc='upper left')\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_experiment2.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "RqUd0Ah8fJ0Q"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "TAFK1iAhfJ3p"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 1"
   ],
   "metadata": {
    "id": "fCCvq5xTTnbH"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Reminder: the second input in np.random.normal is the standard deviation whereas the exercise specified the variance.\n",
    "# You can input the square root of the variance.\n",
    "X = np.random.normal(0,np.sqrt(2),size=10000)\n",
    "\n",
    "# report the mean and variance\n",
    "print(f'Empirical mean = {np.mean(X):.3f}')\n",
    "print(f'Empirical variance = {np.var(X,ddof=1):.3f}')"
   ],
   "metadata": {
    "id": "eXro4KyI-HoK"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# sample sizes\n",
    "Ns = np.arange(10,10200,step=200)\n",
    "\n",
    "# initialize outputs\n",
    "means = np.zeros(len(Ns))\n",
    "vars  = np.zeros(len(Ns))\n",
    "\n",
    "# run experiment\n",
    "for i,n in enumerate(Ns):\n",
    "\n",
    "  # generate random data\n",
    "  X = np.random.normal(0,np.sqrt(2),size=n)\n",
    "  # Note: the second input in np.random.normal is the standard deviation whereas the exercise specified the variance. But 1**2==1, so it works out here.\n",
    "\n",
    "  # compute mean and variance\n",
    "  means[i] = np.mean(X)\n",
    "  vars[i]  = np.var(X,ddof=1)\n",
    "\n",
    "\n",
    "# plot the results\n",
    "_,axs = plt.subplots(2,1,figsize=(8,6))\n",
    "\n",
    "axs[0].plot(Ns[[0,-1]],[0,0],'--',color=(.4,.4,.4))\n",
    "axs[0].plot(Ns,means,'ks',markerfacecolor=(.9,.9,.9),markersize=10)\n",
    "axs[0].set(xlim=[-100,Ns[-1]+110],xlabel='Sample size')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Mean value (expected 0)',loc='left')\n",
    "\n",
    "axs[1].plot(Ns[[0,-1]],[2,2],'--',color=(.4,.4,.4))\n",
    "axs[1].plot(Ns,vars,'ks',markerfacecolor=(.9,.9,.9),markersize=10)\n",
    "axs[1].set(xlim=[-100,Ns[-1]+110],ylabel='Variance')\n",
    "axs[1].set_xlabel('Sample size')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Variance value (expected 2)',loc='left')\n",
    "\n",
    "\n",
    "# final adjustments and export\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_ex1.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "R4MD7E6n4VXh"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "b3ccPdPE4VaO"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 2"
   ],
   "metadata": {
    "id": "d9dRgWpD4Vci"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# uniform data with boundaries [-3,8]\n",
    "a,b = -3,8\n",
    "Y = np.random.uniform(a,b,size=1324)\n",
    "\n",
    "# compute mean and variance discrepancies\n",
    "meanDiff = np.mean(Y) - (a+b)/2\n",
    "varDiff  = ( np.var(Y,ddof=1) - (b-a)**2/12 )**2\n",
    "\n",
    "# print the results\n",
    "print(f'Mean discrepancy (signed): {meanDiff:.3f}')\n",
    "print(f'Variance discrepancy (squared): {varDiff:.3f}')\n",
    "\n",
    "# histogram\n",
    "plt.hist(Y,bins='fd')\n",
    "plt.xlabel('Data values')\n",
    "plt.ylabel('Count')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "ZWEybhV-wf69"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# sample sizes\n",
    "Ns = np.arange(10,10200,step=200)\n",
    "\n",
    "# initialize outputs\n",
    "means = np.zeros(len(Ns))\n",
    "vars  = np.zeros(len(Ns))\n",
    "\n",
    "# run experiment\n",
    "for i,N in enumerate(Ns):\n",
    "\n",
    "  # generate random data (sorting to ensure a>b!)\n",
    "  a,b = np.sort( np.random.randint(-3,11,2) )\n",
    "  Y = np.random.uniform(a,b,size=N)\n",
    "\n",
    "  # compute mean and variance discrepancies\n",
    "  means[i] = np.mean(Y) - (a+b)/2\n",
    "  vars[i]  = ( np.var(Y,ddof=1) - (b-a)**2/12 )**2\n",
    "\n",
    "\n",
    "# plot the results\n",
    "_,axs = plt.subplots(2,1,figsize=(8,6))\n",
    "\n",
    "axs[0].plot(Ns[[0,-1]],[0,0],'--',color=(.4,.4,.4))\n",
    "axs[0].plot(Ns,means,'ks',markerfacecolor=(.9,.9,.9),markersize=8)\n",
    "axs[0].set(xlim=[-100,Ns[-1]+110],xlabel='Sample size',ylabel='Mean discrepancy')\n",
    "axs[0].set_title(r'$\\bf{A}$)  Mean value difference (expected 0)',loc='left')\n",
    "\n",
    "axs[1].plot(Ns[[0,-1]],[0,0],'--',color=(.4,.4,.4))\n",
    "axs[1].plot(Ns,vars,'ks',markerfacecolor=(.9,.9,.9),markersize=8)\n",
    "axs[1].set(xlim=[-100,Ns[-1]+110],ylabel='Squared var discrepancy')\n",
    "axs[1].set_xlabel('Sample size')\n",
    "axs[1].set_title(r'$\\bf{B}$)  Variance difference (expected 0)',loc='left')\n",
    "\n",
    "\n",
    "# export\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_ex2.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "9IcQ1pYy63yg"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Wjj-l1Mo4VfQ"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 3"
   ],
   "metadata": {
    "id": "JnioEj1s4ViB"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "mu = 2\n",
    "sigma = 1.5\n",
    "\n",
    "# normally distributed numbers\n",
    "normal = np.random.normal(0,1,size=10000)\n",
    "\n",
    "# transform to log-normal\n",
    "lognorm = np.exp(normal*sigma + mu)\n",
    "\n",
    "# compute the empirical mean\n",
    "empMean = np.mean(lognorm)\n",
    "\n",
    "# and back-transform\n",
    "empMeanInv = np.log(empMean) - sigma**2/2\n",
    "\n",
    "# report the mean and its transform\n",
    "print(f'Mean of log-normal data is {empMean:.3f}')\n",
    "print(f'Transformed mean of log-normal data is {empMeanInv:.3f}')"
   ],
   "metadata": {
    "id": "NFiXtBpD-zAj"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "mus = np.linspace(1,10,13)\n",
    "\n",
    "means = np.zeros(len(mus))\n",
    "\n",
    "for i,m in enumerate(mus):\n",
    "\n",
    "  # normally distributed data (different implementation compared to the previous code just to show multiple correct answers)\n",
    "  normal = np.random.randn(10000)*sigma + m\n",
    "\n",
    "  # transform to log-normal\n",
    "  lognorm = np.exp(normal)\n",
    "\n",
    "  # empirical mean\n",
    "  means[i] = np.mean( lognorm )\n",
    "\n",
    "  # Of course, you could condense this to one line:\n",
    "  #means[i] = np.mean( np.exp(np.random.randn(10000)*sigma+m) )\n",
    "\n",
    "\n",
    "# plotting\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "axs[0].plot(mus,means,'ks-',markerfacecolor=(.9,.9,.9),markersize=8)\n",
    "axs[0].set(xlabel=r'$\\mu$',ylabel=r'$\\overline{Y}$')\n",
    "\n",
    "axs[1].plot(mus,np.log(means)-sigma**2/2,'ks-',markerfacecolor=(.9,.9,.9),markersize=8)\n",
    "axs[1].set(xlabel=r'$\\mu$',ylabel=r'$\\ln(\\overline{Y})-\\frac{1}{2}\\sigma^2$')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_ex3.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "7-hDU-6G_len"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# you can also compare by subtraction:\n",
    "np.log(means)-sigma**2/2 - mus\n",
    "# discrepancies are not exactly zero, but they are around 6 orders of magnitude smaller than np.mean(lognorm)!"
   ],
   "metadata": {
    "id": "RGUD-uAo_lhz"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "-yq9-OCTSN3k"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 4"
   ],
   "metadata": {
    "id": "IK6UWHqH_lkU"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "a,b = np.sort( np.random.randint(-3,11,2) )\n",
    "N = 1001\n",
    "Y = np.random.uniform(a,b,size=N)\n",
    "\n",
    "# means according to the formulas\n",
    "mu_a = np.sqrt(3)*np.std(Y,ddof=1) + a\n",
    "mu_b = b - np.sqrt(3)*np.std(Y,ddof=1)\n",
    "\n",
    "# print\n",
    "print(f'mu from a : {mu_a:.4f}')\n",
    "print(f'mu from b : {mu_b:.4f}')\n",
    "print(f'mean(Y)   : {np.mean(Y):.4f}')\n",
    "print(f'avg bounds: {(a+b)/2:.4f}')"
   ],
   "metadata": {
    "id": "_yTwekJXcLE4"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# stds according to the formulas\n",
    "sig_a = (np.mean(Y)-a)/np.sqrt(3)\n",
    "sig_b = (b-np.mean(Y))/np.sqrt(3)\n",
    "\n",
    "print(f'sigma from a : {sig_a:.4f}')\n",
    "print(f'sigma from b : {sig_a:.4f}')\n",
    "print(f'std(Y)       : {np.std(Y,ddof=1):.4f}')"
   ],
   "metadata": {
    "id": "xVLzgruf_lnT"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# now to simulate data from mu/sigma parameters\n",
    "\n",
    "# desired parameters\n",
    "mu = 3.5\n",
    "sig = 1.8\n",
    "\n",
    "# generate the data\n",
    "U = np.random.uniform(0,1,size=100000)\n",
    "Y = mu + np.sqrt(3)*sig*(2*U-1)\n",
    "\n",
    "print(f'Empirical mean: {np.mean(Y):.3f}')\n",
    "print(f'Empirical std : {np.std(Y,ddof=1):.3f}')"
   ],
   "metadata": {
    "id": "3f6akYfk_lqM"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "lChYUw754VnO"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 5"
   ],
   "metadata": {
    "id": "R1nsCZkBTnX-"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# Expected median is the same as the mean: (a+b)/2\n",
    "#\n",
    "# Expected mode is any/all values! The probability of any one value is the same as that of any other value.\n",
    "# Of course, in a finite sample, there is likely to be one or a small number of modes due to sampling variability."
   ],
   "metadata": {
    "id": "JBDfEiVLNbf_"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "Z9kH9dUFNbi_"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 6"
   ],
   "metadata": {
    "id": "2FQmbG5lNblf"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# reference: https://en.wikipedia.org/wiki/Triangular_distribution"
   ],
   "metadata": {
    "id": "T_UZi7ErhuNs"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# parameters\n",
    "a = .2\n",
    "c = .6\n",
    "b = .9\n",
    "N = 10000 # sample size\n",
    "\n",
    "# function F(c) (as defined in wiki page)\n",
    "Fc = (c-a) / (b-a)\n",
    "\n",
    "# initialize U and y\n",
    "U = np.random.rand(N)\n",
    "y = np.zeros(N)\n",
    "\n",
    "# apply transformation\n",
    "y[U<Fc] = a + np.sqrt( U[U<Fc]*(b-a)*(c-a) )\n",
    "y[U>Fc] = b - np.sqrt( (1-U[U>Fc])*(b-a)*(b-c) )\n",
    "\n",
    "\n",
    "# create a histogram\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.hist(y,'fd',color=(.8,.8,.8),edgecolor='k')\n",
    "plt.xlim([a-.2,b+.2])\n",
    "plt.xlabel('Data values')\n",
    "plt.ylabel('Counts')\n",
    "plt.title(f'Triangular distribution with a={a}, b={b}, c={c}',loc='center')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_ex6.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "5o3Oc8-iNboB"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# repeat using numpy function\n",
    "Y = np.random.triangular(a,c,b,size=N)\n",
    "\n",
    "# create a histogram\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.hist(Y,'fd',color=(.8,.8,.8),edgecolor='k')\n",
    "plt.xlim([a-.2,b+.2])\n",
    "plt.xlabel('Data values')\n",
    "plt.ylabel('Counts')\n",
    "plt.title(f'Triangular distribution with a={a}, b={b}, c={c}',loc='center')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "zsv_7EQWNbqx"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "L6qpKVhzNbtT"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 7"
   ],
   "metadata": {
    "id": "3h-mC_mtNbwL"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# create an normally distributed integer dataset from a standard normal\n",
    "\n",
    "# rounding a standard normal\n",
    "X = np.random.normal(loc=0,scale=1,size=100000)\n",
    "X = np.round(X)\n",
    "\n",
    "# notice there are only 9 unique values:\n",
    "print(np.unique(X))\n",
    "\n",
    "# the histogram looks... thin\n",
    "plt.hist(X,bins='fd')\n",
    "plt.title(f'{len(np.unique(X))} unique numbers',loc='center')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "oIiuMF8XqBPB"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Technically, the solution above is correct, but it feels unsatisfying to me,\n",
    "# because the number of unique elements is so small. A simple solution is to\n",
    "# increase the width of the distribution.\n",
    "\n",
    "\n",
    "# rounding a normal with larger sigma\n",
    "X = np.random.normal(loc=0,scale=15,size=100000)\n",
    "X = np.round(X)\n",
    "\n",
    "# the histogram looks better\n",
    "plt.hist(X,bins='fd')\n",
    "plt.title(f'{len(np.unique(X))} unique numbers',loc='center')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "O2b9eM6vqBR2"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "lh57xCa3qBUK"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 8"
   ],
   "metadata": {
    "id": "Ti6dqXTWNbyz"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "N = 100\n",
    "\n",
    "# create the data matrix\n",
    "M = np.random.randn(N,2)\n",
    "M[:,1] += M[:,0]\n",
    "\n",
    "# correlation coefficient\n",
    "# Note: the output is a matrix, and we want an off-diagonal element. You'll understand this in Chapter 12.\n",
    "r_real = np.corrcoef(M.T)[1,0]"
   ],
   "metadata": {
    "id": "OefEnWUuNb1n"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# now to shuffle the data\n",
    "ridx = np.random.permutation(N)\n",
    "\n",
    "# make a copy of the data\n",
    "Mshuf = M.copy()\n",
    "Mshuf[:,0] = Mshuf[ridx,0]\n",
    "\n",
    "# new correlation\n",
    "r_shuf = np.corrcoef(Mshuf.T)[1,0]\n",
    "\n",
    "# report the real and shuffled correlation coefficients\n",
    "# (not formally requested in the text, but a nice addition ;)  )\n",
    "print(f'Real correlation     r = {r_real:.3f}')\n",
    "print(f'Shuffled correlation r = {r_shuf:.3f}')"
   ],
   "metadata": {
    "id": "OQjPq10mHtpU"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# plot\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "axs[0].plot(M[:,0],M[:,1],'ko')\n",
    "axs[0].set(xlabel='Variable x',ylabel='Variable y')\n",
    "axs[0].set_title(f'$\\\\bf{{A}}$) Real correlation r={r_real:.2f}')\n",
    "\n",
    "axs[1].plot(Mshuf[:,0],Mshuf[:,1],'ko')\n",
    "axs[1].set(xlabel='Variable x (shuffled)',ylabel='Variable y')\n",
    "axs[1].set_title(f'$\\\\bf{{B}}$) Shuffled correlation r={r_shuf:.2f}')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_ex8.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "TS1_opjYHqK1"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "kpDXyEm4Nb95"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 9"
   ],
   "metadata": {
    "id": "zEfjMSc0tkiU"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "import scipy.stats as stats\n",
    "\n",
    "# sample size\n",
    "N = 3000\n",
    "\n",
    "# run one of these lines...\n",
    "# data = stats.exponnorm.rvs(3,size=N)\n",
    "# data = stats.norm.rvs(loc=3,scale=1,size=N)\n",
    "data = stats.laplace.rvs(size=N)\n",
    "# data = stats.gumbel_r.rvs(size=N)\n",
    "\n",
    "# descriptives\n",
    "mean = np.mean(data)\n",
    "std  = np.std(data,ddof=1)\n",
    "\n",
    "# histograms via numpy\n",
    "yy,xx = np.histogram(data,bins='fd')\n",
    "xx = (xx[:-1]+xx[1:])/2\n",
    "\n",
    "# find bars within 1 std of the mean\n",
    "closeBars = np.logical_and(xx>(mean-std), xx<(mean+std))\n",
    "\n",
    "\n",
    "# show me the data!\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "# scatter plot of the data\n",
    "axs[0].plot(data,'o',color=(.3,.3,.3),markerfacecolor=(.8,.8,.8),alpha=.2)\n",
    "axs[0].axhline(mean-std,linestyle='--',linewidth=2,color='k')\n",
    "axs[0].axhline(mean+std,linestyle='--',linewidth=2,color='k')\n",
    "\n",
    "axs[0].set(xlabel='Data index',ylabel='Data value',xlim=[-10,len(data)+10])\n",
    "axs[0].set_title(r'$\\bf{A})$  Raw data values')\n",
    "\n",
    "# now show the histogram\n",
    "axs[1].bar(xx[~closeBars],yy[~closeBars],width=30/len(xx),color=(.7,.7,.7),edgecolor=None,label=r'$x>(\\overline{x}\\pm\\sigma)$')\n",
    "axs[1].bar(xx[closeBars],yy[closeBars],width=30/len(xx),color=(.2,.2,.2),edgecolor=None,label=r'$x\\in(\\overline{x}\\pm\\sigma)$')\n",
    "\n",
    "axs[1].set(xlabel='Data value',ylabel='Count')\n",
    "axs[1].legend()\n",
    "axs[1].set_title(r'$\\bf{B})$  Data histogram')\n",
    "\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_ex9.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "y7yfaRY2tkfM"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# Link to a list of distributions you can try\n",
    "# https://docs.scipy.org/doc/scipy/reference/stats.html#continuous-distributions\n",
    "# (For this exercise, use only 'continuous' distributions.)"
   ],
   "metadata": {
    "id": "ta8B-9djtkcU"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "BWsan_rctkW4"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "markdown",
   "source": [
    "# Exercise 10"
   ],
   "metadata": {
    "id": "JRkpGGKR3tnv"
   }
  },
  {
   "cell_type": "code",
   "source": [
    "# the key factors to manipulate\n",
    "stds = np.linspace(.01,10,40)\n",
    "\n",
    "# parameters to hold constant\n",
    "samplesize = 10000\n",
    "mean = 0\n",
    "\n",
    "\n",
    "# initialize results matrix\n",
    "results = np.zeros(len(stds))\n",
    "expectedMean = np.zeros(len(stds))\n",
    "\n",
    "\n",
    "# start the experiment\n",
    "for stdi in range(len(stds)):\n",
    "\n",
    "  # data parameters for this experiment run\n",
    "  thisStd = stds[stdi]\n",
    "\n",
    "  # generate data\n",
    "  X = np.random.normal(0,1,samplesize)\n",
    "  data = np.exp(X*thisStd + mean)\n",
    "\n",
    "  # collect results\n",
    "  results[stdi] = np.mean(data)\n",
    "\n",
    "  # expected average\n",
    "  expectedMean[stdi] = np.exp(mean + thisStd**2/2)\n",
    "\n",
    "\n",
    "# plot the results\n",
    "plt.figure(figsize=(8,4))\n",
    "plt.plot(stds,np.log(results),'ks',markersize=10,markerfacecolor=(.9,.9,.9))\n",
    "plt.plot(stds,np.log(expectedMean),'k--')\n",
    "\n",
    "# add to the plot\n",
    "plt.xlabel('Standard deviation')\n",
    "plt.ylabel('Empirical mean (log)')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_ex10a.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "U-g3wK183tlX"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "# values of sigma to illustrate\n",
    "sigmas = [1,10]\n",
    "\n",
    "# prepare the figure\n",
    "_,axs = plt.subplots(1,2,figsize=(10,4))\n",
    "\n",
    "\n",
    "# make the figure\n",
    "for i,s in enumerate(sigmas):\n",
    "\n",
    "  # generate data\n",
    "  Y = np.exp(np.random.randn(10000)*s)\n",
    "\n",
    "  # plot the results\n",
    "  axs[i].hist(np.log(Y),bins=100,color='k',label='Data')\n",
    "  axs[i].axvline(x=np.log(np.mean(Y)),color='gray',label='Mean')\n",
    "\n",
    "\n",
    "# prettify both axes\n",
    "axs[0].set_title(f'$\\\\bf{{A}}$)  $\\\\sigma$ = {sigmas[0]}',loc='left')\n",
    "axs[1].set_title(f'$\\\\bf{{B}}$)  $\\\\sigma$ = {sigmas[1]}',loc='left')\n",
    "for a in axs:\n",
    "  a.set(xlabel='ln(data) value',ylabel='Count')\n",
    "  a.legend()\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig('simdat_ex10b.png')\n",
    "plt.show()"
   ],
   "metadata": {
    "id": "hzGkN8Dt3tbW"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "sVC04GFl_dSr"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}